In [16]:
#loading dataset
import arff
import math
from numpy import median
from numpy import mean
df=arff.load(open('attachments/trainProdSelection/trainProdSelection.arff','rb'))
train=df['data']

In [17]:
#shuffling dataset and splitting into training and testing sets in the ratio 3:1
from random import shuffle
shuffle(train)
sp=int(0.25*(len(train)))

test = train[:sp]
train=train[sp:]

Pre Processing


In [18]:
#converting categorical string values to integers
#and separating all attributes
dicTyp={"student":0, "engineer":1,"librarian":2,"professor":3,"doctor":4}
dicLs={"spend<<saving":0,"spend<saving":1, "spend>saving":2, "spend>>saving":3}
dicLab={"C1":0,"C2":1,"C3":2,"C4":3,"C5":4}
typ=[]
ls=[]
vac=[]
ec=[]
sal=[]
prp=[]
lab=[]
for i in range(len(train)):
    typ.append(dicTyp[train[i][0]])
    ls.append(dicLs[train[i][1]])
    vac.append(int(round(train[i][2])))
    ec.append(int(round(train[i][3])))
    sal.append(int(round(train[i][4])))
    prp.append(int(round(train[i][5])))
    lab.append(int(round(dicLab[train[i][6]])))
typ1=[]
ls1=[]
vac1=[]
ec1=[]
sal1=[]
prp1=[]
lab1=[]
for i in range(len(test)):
    typ1.append(dicTyp[test[i][0]])
    ls1.append(dicLs[test[i][1]])
    vac1.append(int(round(test[i][2])))
    ec1.append(int(round(test[i][3])))
    sal1.append(int(round(test[i][4])))
    prp1.append(int(round(test[i][5])))
    lab1.append(int(round(dicLab[test[i][6]])))

In [19]:
#method for calculating entrophy of target variable
def mlab():
    count=[1,1,1,1,1,1]
    lab1=[0,0,0,0,0]
    for i in lab:
        lab1[i]=lab1[i]+1
    for i in range(len(lab1)):
        lab1[i]=-1*((lab1[i]/(len(lab)*1.0))*(math.log(lab1[i]/(len(lab)*1.0),2)))
    Hy=sum(lab1)
    return Hy

In [20]:
#method for calculating information gain of type attribute
def mtyp():
    enTyp=0
    for i in range(5):
        l1=[0,0,0,0,0]
        for j in range(len(typ)):
            if(i==typ[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enTyp=enTyp+((s/(len(typ)*1.0))*lo)
    igTyp=mlab()-enTyp
    return igTyp

In [21]:
#method for calculating information gain of life style attribute
def mls():
    enLs=0
    for i in range(4):
        l1=[0,0,0,0,0]
        for j in range(len(ls)):
            if(i==ls[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enLs=enLs+((s/(len(ls)*1.0))*lo)
    igLs=mlab()-enLs
    return igLs

In [22]:
#method for calculating information gain of vacation attribute
def mvac():
    m=max(vac)
    l1=[]
    for i in range(m):
        l1.append(0)
    enVac=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(vac)):
            if(i==vac[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enVac=enVac+((s/(len(vac)*1.0))*lo)
    igVac=mlab()-enVac
    return igVac

In [23]:
#method for calculating information gain of e-credit attribute
def mec():
    m=max(ec)
    l1=[]
    for i in range(m):
        l1.append(0)
    enEc=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(ec)):
            if(i==ec[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enEc=enEc+((s/(len(ec)*1.0))*lo)
    igEc=mlab()-enEc
    return igEc

In [24]:
#method for calculating information gain of salary attribute
def msal():
    m=max(sal)
    l1=[]
    for i in range(m):
        l1.append(0)
    enSal=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(sal)):
            if(i==sal[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enSal=enSal+((s/(len(sal)*1.0))*lo)
    igSal=mlab()-enSal
    return igSal

In [25]:
#method for calculating information gain of property attribute
def mprp():
    m=max(prp)
    l1=[]
    for i in range(m):
        l1.append(0)
    enPrp=0
    for i in range(m+1):
        l1=[0,0,0,0,0]
        for j in range(len(prp)):
            if(i==prp[j]):
                l1[lab[j]]=l1[lab[j]]+1
        s=sum(l1)*1.0
        lo=0
        for j in range(len(l1)):
            if(l1[j]!=0):
                lo=lo+(-1*(l1[j]/s)*math.log((l1[j]/s),2))
        enPrp=enPrp+((s/(len(prp)*1.0))*lo)
    igPrp=mlab()-enPrp
    return igPrp
print mtyp()
print mls()
print mvac()
print mec()
print msal()
print mprp()


0.981583202533
0.39361260618
1.66822851745
1.92858384359
0.994292318402
1.01103226666

In [26]:
# root=[1,1,1,1,1,1]

In [27]:
#node class for decision tree whicch contains all attributes information, information gain and data as threshould for splitting
class Tree(object):
    def __init__(self):
        
        self.data = None
        self.classifier=None
        self.typ=[]
        self.ls=[]
        self.vac=[]
        self.ec=[]
        self.sal=[]
        self.prp=[]
        self.lab=[]
        self.attr=[0,0,0,0,0,0]
        self.ig=[0,0,0,0,0,0]
        self.splitAttr=None

Constructing Decision Tree


In [28]:
#creating dictionary which contains index of attribute and method to  calculate information gain based on the index of attribute
dictAttr1={0:mtyp(),1:mls(),2:mvac(),3:mec(),4:msal(),5:mprp()}
# dictAttr2={mtyp():0,mls():1,mvac():2,3:mec(),4:msal(),5:mprp()}

#constrcuting output array for binary tree with number of number nodes as 2 power of attributes
#Root node starts at index 1
#if index parent node is 1 and its childs indices are 2*i and 2*i+1
output=[None]*(2**7)

#rcontructing root node which starts at index 1 in the output array by selecting the max information gain from all the attributes
output[1]=Tree()
output[1].data=mean(ec)
output[1].typ=typ
output[1].ls=ls
output[1].vac=vac
output[1].ec=ec
output[1].sal=sal
output[1].prp=prp
output[1].lab=lab
output[1].attr=[1,1,1,0,1,1]
output[1].ig=[1,1,1,0,1,1]
output[1].splitAttr=3


#splitting the data on the attribute by data(threshould) value and iteratively calculating the information gain on the
#non splitted attrbutes and constructing decision tree until all the attributes are finished 

for i in range(2,(2**7)):
    
    output[i]=Tree()
    for j in range(len(output[i/2].ec)):
        if(i%2==0 and output[i/2].ec[j]<=output[i/2].data):
            output[i].typ.append(output[i/2].typ[j])
            output[i].ls.append(output[i/2].ls[j])
            output[i].vac.append(output[i/2].vac[j])
            output[i].ec.append(output[i/2].ec[j])
            output[i].sal.append(output[i/2].sal[j])
            output[i].prp.append(output[i/2].prp[j])
            output[i].lab.append(output[i/2].lab[j])
        elif(i%2==1 and output[i/2].ec[j]>output[i/2].data):
            output[i].typ.append(output[i/2].typ[j])
            output[i].ls.append(output[i/2].ls[j])
            output[i].vac.append(output[i/2].vac[j])
            output[i].ec.append(output[i/2].ec[j])
            output[i].sal.append(output[i/2].sal[j])
            output[i].prp.append(output[i/2].prp[j])
            output[i].lab.append(output[i/2].lab[j])
    temp=[0]*5
    if(max(output[i/2].attr)==0):
        for j in range(len(output[i].lab)):
            temp[output[i].lab[j]]=temp[output[i].lab[j]]+1
        ind=temp.index(max(temp))
        output[i].classifier=ind
        continue
            
    
    typ=output[i].typ[:]
    ls=output[i].ls[:]
    vac=output[i].vac[:]
    ec=output[i].ec[:]
    sal=output[i].sal[:]
    prp=output[i].prp[:]
    lab=output[i].lab[:]
    output[i].attr=output[i/2].attr[:]
    for j in range(len(output[i/2].attr)):
        if(output[i/2].attr[j]==1):
            output[i].ig[j]=dictAttr1[j]
    ind=output[i].ig.index(max(output[i].ig))
    output[i].attr[ind]=0
    output[i].splitAttr=ind

Calculating accuracy of the model by using testing data


In [29]:
# predicting the classifier of the testing data from the decision tree and counting the number of correct classifications

dictTest={0:typ1,1:ls1,2:vac1,3:ec1,4:sal1,5:prp1}
count=0
for i in range(len(test)):
    ind=1;
    temp=output[ind]
    while(temp.splitAttr!=None):
        if(dictTest[temp.splitAttr][i]<=temp.data):
            temp=output[2*ind]
            ind=ind*2;
        else:
            temp=output[2*ind+1]
            ind=ind*2+1
    if(temp.classifier==lab1[i]):
        count=count+1
        
#calculating the accuracy of the model by the formula (number of true predictions / number of total predictions)
print "The model's accuracy is:",(float(count)/len(test))*100


The model's accuracy is: 52.1739130435

In [ ]:


In [ ]:


In [ ]: